#importing packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
# Load the Netflix titles dataset (path is relative to the working directory)
# and preview its first rows.
netflix_csv = "netflix_titles.csv"
df = pd.read_csv(netflix_csv)
df.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Examine the data: number of non-null entries in each column
df.notna().sum()
show_id 8807 type 8807 title 8807 director 6173 cast 7982 country 7976 date_added 8797 release_year 8807 rating 8803 duration 8804 listed_in 8807 description 8807 dtype: int64
# Missing data: print the percentage of null values for every column that
# has at least one.
row_total = len(df)
for column in df.columns:
    null_rate = df[column].isna().sum() / row_total * 100
    if null_rate > 0:
        rounded_rate = round(null_rate, 2)
        print("{} null rate: {}%".format(column, rounded_rate))
director null rate: 29.91% cast null rate: 9.37% country null rate: 9.44% date_added null rate: 0.11% rating null rate: 0.05% duration null rate: 0.03%
# Dealing with missing data
# Country: fill the gaps with the most frequent value (the mode).
df['country'] = df['country'].fillna(df['country'].mode()[0])

# Cast / director: label missing entries explicitly.
# The result is assigned back to the column instead of calling an inplace
# method on the `df[...]` selection: the chained inplace form operates on an
# intermediate object, which pandas warns about and which does not update
# `df` when Copy-on-Write is enabled.
df['cast'] = df['cast'].fillna('No Data')
df['director'] = df['director'].fillna('No Data')

# Drop the rows that still contain NAs in any other column
df.dropna(inplace=True)

# Drop duplicated rows
df.drop_duplicates(inplace=True)

# Divide the catalogue into TV shows and movies
showz = df[df['type'] == 'TV Show']
moviez = df[df['type'] == 'Movie']

# Count plot of the number of titles per content type
sns.set(style="darkgrid")
sns.countplot(x="type", data=df, palette="Set3").set(title='Number of Content Divided by Type')
plt.show()
# Examining when TV shows were added to Netflix (month-by-year heatmap)
# .copy() gives df1 its own data, so adding the columns below no longer
# triggers SettingWithCopyWarning.
df1 = showz[['date_added']].copy()
# Year is whatever follows the last ", " in date_added
df1['year'] = df1['date_added'].apply(lambda x: x.split(', ')[-1])
# Month is the first word; lstrip() guards against leading whitespace
df1['month'] = df1['date_added'].apply(lambda x: x.lstrip().split(' ')[0])

month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December'][::-1]

# Rows are months (in month_order), columns are years, cells are title counts.
# Stored under its own name so the cleaned `df` is not overwritten by the
# pivot table.
added_per_month = df1.groupby('year')['month'].value_counts().unstack().fillna(0)[month_order].T

plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(added_per_month, cmap='afmhot_r', edgecolors='white', linewidths=2)  # heatmap
# Ticks sit at cell centres (offset 0.5) and are labelled with years / months
plt.xticks(np.arange(0.5, len(added_per_month.columns), 1), added_per_month.columns,
           fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(added_per_month.index), 1), added_per_month.index,
           fontsize=7, fontfamily='serif')

plt.title('Netflix Contents Update', fontsize=12, fontfamily='calibri', fontweight='bold',
          position=(0.20, 1.0+0.02))
cbar = plt.colorbar()
cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()
plt.show()
<ipython-input-8-c02f0ab80d67>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-8-c02f0ab80d67>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy findfont: Font family ['calibri'] not found. Falling back to DejaVu Sans.
# Parental-advisory ratings of movies, most frequent first (at most 15 shown)
top_ratings = moviez['rating'].value_counts().index[:15]
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
sns.countplot(data=moviez, x="rating", order=top_ratings, palette="Set2")
plt.show()
# Feature engineering
# Reload the raw data so the steps below start from a fresh frame
df = pd.read_csv("netflix_titles.csv")

# Replacements
df['country'] = df['country'].fillna(df['country'].mode()[0])
# Assigned back to the column rather than using an inplace method on the
# `df[...]` selection, which pandas warns about and which does not update
# `df` when Copy-on-Write is enabled.
df['cast'] = df['cast'].fillna('No Data')
df['director'] = df['director'].fillna('No Data')

# Drops
df.dropna(inplace=True)

# Drop duplicates
df.drop_duplicates(inplace=True)

# Helper column for various plots (summing it counts rows per group)
df['count'] = 1


def _first_listed_country(value):
    """Return the first non-empty country name of a comma-separated string.

    Empty pieces are skipped so that a value starting with a comma does not
    produce an empty country name. Returns "" when no name is present.
    """
    for name in value.split(","):
        name = name.strip()
        if name:
            return name
    return ""


# Grabbing the first country mentioned for production
df['first_country'] = df['country'].apply(_first_listed_country)
df['first_country'].head()
# Translate each maturity rating into a coarse target-age group.
# The groups are listed once and inverted into the rating -> group lookup.
_age_group_members = {
    'Kids': ('TV-Y', 'TV-G', 'G'),
    'Older Kids': ('TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'),
    'Teens': ('TV-14', 'PG-13'),
    'Adults': ('TV-MA', 'R', 'NR', 'UR', 'NC-17'),
}
ratings_ages = {
    rating: group
    for group, members in _age_group_members.items()
    for rating in members
}

# Ratings that are not in the lookup are left unchanged by replace()
target_ages = df['rating'].replace(ratings_ages)
df['target_ages'] = target_ages
df['target_ages'].unique()
# Genre: turn the comma-separated `listed_in` text into a list of genres,
# normalising the spacing around commas before splitting.
df['genre'] = df['listed_in'].apply(lambda x: x.replace(' ,', ',').replace(', ', ',').split(','))

# Reducing name length for plot labels.
# One dict-based replace assigned back to the column; the previous chained
# `df[col].replace(..., inplace=True)` calls act on an intermediate object,
# which pandas warns about and which does not update `df` when
# Copy-on-Write is enabled.
df['first_country'] = df['first_country'].replace({
    'United States': 'USA',
    'United Kingdom': 'UK',
    'South Korea': 'S. Korea',
})
# Creating a map showing content creation locations
# Lookup from lower-cased country name to the three-letter code used as the
# choropleth location. Keys must be lower case because callers look names up
# with `name.lower()`.
# NOTE(review): 'KSV' (kosovo) and 'WBG' (west bank) are not ISO 3166-1
# alpha-3 codes, and 'virgin islands' shares 'VGB' with
# 'british virgin islands' - confirm these against the plotting backend.
country_codes = {
    'afghanistan': 'AFG',
    'albania': 'ALB',
    'algeria': 'DZA',
    'american samoa': 'ASM',
    'andorra': 'AND',
    'angola': 'AGO',
    'anguilla': 'AIA',
    'antigua and barbuda': 'ATG',
    'argentina': 'ARG',
    'armenia': 'ARM',
    'aruba': 'ABW',
    'australia': 'AUS',
    'austria': 'AUT',
    'azerbaijan': 'AZE',
    'bahamas': 'BHS',  # was 'BHM', which is not the ISO alpha-3 code
    'bahrain': 'BHR',
    'bangladesh': 'BGD',
    'barbados': 'BRB',
    'belarus': 'BLR',
    'belgium': 'BEL',
    'belize': 'BLZ',
    'benin': 'BEN',
    'bermuda': 'BMU',
    'bhutan': 'BTN',
    'bolivia': 'BOL',
    'bosnia and herzegovina': 'BIH',
    'botswana': 'BWA',
    'brazil': 'BRA',
    'british virgin islands': 'VGB',
    'brunei': 'BRN',
    'bulgaria': 'BGR',
    'burkina faso': 'BFA',
    'burma': 'MMR',
    'burundi': 'BDI',
    'cabo verde': 'CPV',
    'cambodia': 'KHM',
    'cameroon': 'CMR',
    'canada': 'CAN',
    'cayman islands': 'CYM',
    'central african republic': 'CAF',
    'chad': 'TCD',
    'chile': 'CHL',
    'china': 'CHN',
    'colombia': 'COL',
    'comoros': 'COM',
    'congo democratic': 'COD',
    'congo republic': 'COG',  # key was capitalised and could never match
    'cook islands': 'COK',
    'costa rica': 'CRI',
    "cote d'ivoire": 'CIV',
    'croatia': 'HRV',
    'cuba': 'CUB',
    'curacao': 'CUW',
    'cyprus': 'CYP',
    'czech republic': 'CZE',
    'denmark': 'DNK',
    'djibouti': 'DJI',
    'dominica': 'DMA',
    'dominican republic': 'DOM',
    'ecuador': 'ECU',
    'egypt': 'EGY',
    'el salvador': 'SLV',
    'equatorial guinea': 'GNQ',
    'eritrea': 'ERI',
    'estonia': 'EST',
    'ethiopia': 'ETH',
    'falkland islands': 'FLK',
    'faroe islands': 'FRO',
    'fiji': 'FJI',
    'finland': 'FIN',
    'france': 'FRA',
    'french polynesia': 'PYF',
    'gabon': 'GAB',
    'gambia, the': 'GMB',
    'georgia': 'GEO',
    'germany': 'DEU',
    'ghana': 'GHA',
    'gibraltar': 'GIB',
    'greece': 'GRC',
    'greenland': 'GRL',
    'grenada': 'GRD',
    'guam': 'GUM',
    'guatemala': 'GTM',
    'guernsey': 'GGY',
    'guinea-bissau': 'GNB',
    'guinea': 'GIN',
    'guyana': 'GUY',
    'haiti': 'HTI',
    'honduras': 'HND',
    'hong kong': 'HKG',
    'hungary': 'HUN',
    'iceland': 'ISL',
    'india': 'IND',
    'indonesia': 'IDN',
    'iran': 'IRN',
    'iraq': 'IRQ',
    'ireland': 'IRL',
    'isle of man': 'IMN',
    'israel': 'ISR',
    'italy': 'ITA',
    'jamaica': 'JAM',
    'japan': 'JPN',
    'jersey': 'JEY',
    'jordan': 'JOR',
    'kazakhstan': 'KAZ',
    'kenya': 'KEN',
    'kiribati': 'KIR',
    'north korea': 'PRK',
    'south korea': 'KOR',
    'kosovo': 'KSV',
    'kuwait': 'KWT',
    'kyrgyzstan': 'KGZ',
    'laos': 'LAO',
    'latvia': 'LVA',
    'lebanon': 'LBN',
    'lesotho': 'LSO',
    'liberia': 'LBR',
    'libya': 'LBY',
    'liechtenstein': 'LIE',
    'lithuania': 'LTU',
    'luxembourg': 'LUX',
    'macau': 'MAC',
    'macedonia': 'MKD',
    'madagascar': 'MDG',
    'malawi': 'MWI',
    'malaysia': 'MYS',
    'maldives': 'MDV',
    'mali': 'MLI',
    'malta': 'MLT',
    'marshall islands': 'MHL',
    'mauritania': 'MRT',
    'mauritius': 'MUS',
    'mexico': 'MEX',
    'micronesia': 'FSM',
    'moldova': 'MDA',
    'monaco': 'MCO',
    'mongolia': 'MNG',
    'montenegro': 'MNE',
    'morocco': 'MAR',
    'mozambique': 'MOZ',
    'namibia': 'NAM',
    'nepal': 'NPL',
    'netherlands': 'NLD',
    'new caledonia': 'NCL',
    'new zealand': 'NZL',
    'nicaragua': 'NIC',
    'nigeria': 'NGA',
    'niger': 'NER',
    'niue': 'NIU',
    'northern mariana islands': 'MNP',
    'norway': 'NOR',
    'oman': 'OMN',
    'pakistan': 'PAK',
    'palau': 'PLW',
    'panama': 'PAN',
    'papua new guinea': 'PNG',
    'paraguay': 'PRY',
    'peru': 'PER',
    'philippines': 'PHL',
    'poland': 'POL',
    'portugal': 'PRT',
    'puerto rico': 'PRI',
    'qatar': 'QAT',
    'romania': 'ROU',
    'russia': 'RUS',
    'rwanda': 'RWA',
    'saint kitts and nevis': 'KNA',
    'saint lucia': 'LCA',
    'saint martin': 'MAF',
    'saint pierre and miquelon': 'SPM',
    'saint vincent and the grenadines': 'VCT',
    'samoa': 'WSM',
    'san marino': 'SMR',
    'sao tome and principe': 'STP',
    'saudi arabia': 'SAU',
    'senegal': 'SEN',
    'serbia': 'SRB',
    'seychelles': 'SYC',
    'sierra leone': 'SLE',
    'singapore': 'SGP',
    'sint maarten': 'SXM',
    'slovakia': 'SVK',
    'slovenia': 'SVN',
    'solomon islands': 'SLB',
    'somalia': 'SOM',
    'south africa': 'ZAF',
    'south sudan': 'SSD',
    'spain': 'ESP',
    'sri lanka': 'LKA',
    'sudan': 'SDN',
    'suriname': 'SUR',
    'swaziland': 'SWZ',
    'sweden': 'SWE',
    'switzerland': 'CHE',
    'syria': 'SYR',
    'taiwan': 'TWN',
    'tajikistan': 'TJK',
    'tanzania': 'TZA',
    'thailand': 'THA',
    'timor-leste': 'TLS',
    'togo': 'TGO',
    'tonga': 'TON',
    'trinidad and tobago': 'TTO',
    'tunisia': 'TUN',
    'turkey': 'TUR',
    'turkmenistan': 'TKM',
    'tuvalu': 'TUV',
    'uganda': 'UGA',
    'ukraine': 'UKR',
    'united arab emirates': 'ARE',
    'united kingdom': 'GBR',
    'united states': 'USA',
    'uruguay': 'URY',
    'uzbekistan': 'UZB',
    'vanuatu': 'VUT',
    'venezuela': 'VEN',
    'vietnam': 'VNM',
    'virgin islands': 'VGB',
    'west bank': 'WBG',
    'yemen': 'YEM',
    'zambia': 'ZMB',
    'zimbabwe': 'ZWE',
}
## countries
from collections import Counter
# Seventeen hex colours running from a very light blue to a dark blue.
# NOTE(review): nothing in the visible source reads this list - geoplot()
# defines its own colorscale inline - confirm whether it is still needed.
colorscale = ["#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9", "#9ecae1",
"#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be", "#2171b5", "#1361a9",
"#08519c", "#0b4083", "#08306b"
]
def geoplot(ddf):
    """Draw a world choropleth of title counts per country and return the counts.

    Each value of ``ddf['country']`` is read as a comma-separated list of
    country names; every listed name adds one to that country's count.

    Parameters
    ----------
    ddf : pandas.DataFrame
        Frame with a ``country`` column of comma-separated names. Missing
        values are ignored.

    Returns
    -------
    dict
        Country name -> number of titles listing it. Countries without an
        entry in ``country_codes`` are included here but are not drawn on
        the map.
    """
    # Split row by row and strip each piece, so stray commas or spacing do
    # not produce empty names or names with a trailing comma.
    names = (
        piece.strip()
        for entry in ddf['country'].dropna()
        for piece in entry.split(',')
    )
    country = dict(Counter(name for name in names if name))

    # Only countries with a known code become map locations. Counts are
    # summed per code because more than one name can map to the same code.
    country_with_code = {}
    for name, total in country.items():
        code = country_codes.get(name.lower())
        if code is None:
            continue
        country_with_code[code] = country_with_code.get(code, 0) + total

    data = [dict(
        type='choropleth',
        locations=list(country_with_code.keys()),
        z=list(country_with_code.values()),
        colorscale=[
            [0, "rgb(5, 10, 172)"],
            [0.65, "rgb(40, 60, 190)"],
            [0.75, "rgb(70, 100, 245)"],
            [0.80, "rgb(90, 120, 245)"],
            [0.9, "rgb(106, 137, 247)"],
            [1, "rgb(220, 220, 220)"],
        ],
        autocolorscale=False,
        reversescale=True,
        marker=dict(
            line=dict(
                color='gray',
                width=0.5,
            ),
        ),
        # NOTE(review): `autotick` looks like a legacy attribute; it only
        # gets through because validation is disabled below - confirm.
        colorbar=dict(
            autotick=False,
            title='',
        ),
    )]

    layout = dict(
        title='',
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection=dict(
                # plotly projection names are lower case
                type='mercator',
            ),
        ),
    )

    fig = dict(data=data, layout=layout)
    iplot(fig, validate=False, filename='d3-world-map')
    return country
# Horizontal bar chart of the 25 countries that appear on the most titles
country_vals = geoplot(df)
tabs = Counter(country_vals).most_common(25)

# most_common() returns the largest first; the order is reversed for plotting
# (presumably so the largest bar is drawn at the top - confirm).
reversed_tabs = tabs[::-1]
labels = [name for name, _total in reversed_tabs]
values = [total for _name, total in reversed_tabs]

trace1 = go.Bar(
    x=values,
    y=labels,
    orientation="h",
    name="",
    marker={"color": "#a678de"},
)
data = [trace1]
layout = go.Layout(
    title="Countries with most content",
    height=700,
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)
fig = go.Figure(data, layout=layout)
fig.show()
# Examining which countries produce the most Netflix content, in numbers
data = df.groupby('first_country')['count'].sum().sort_values(ascending=False)[:10]

# Plot colours: the three largest bars are highlighted, the rest are neutral.
# Sized from the data so fewer than ten countries still get one colour each.
color_map = ['#f5f5f1'] * len(data)
color_map[:3] = ['#b20710'] * min(3, len(data))  # color highlight

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.bar(data.index, data, width=0.5,
       edgecolor='darkgray',
       linewidth=0.6, color=color_map)

# Annotations: write each count just above its bar
for i in data.index:
    ax.annotate(f"{data[i]}",
                xy=(i, data[i] + 150),  # i like to change this to roughly 5% of the highest cat
                va='center', ha='center', fontweight='light', fontfamily='serif')

# Remove border from plot
for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Tick labels: pin the tick positions first. Calling set_xticklabels alone
# raises "FixedFormatter should only be used together with FixedLocator".
ax.set_xticks(np.arange(len(data.index)))
ax.set_xticklabels(data.index, fontfamily='serif', rotation=0)

# Title and sub-title
fig.text(0.09, 1, 'Top 10 countries on Netflix', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.09, 0.95, 'The three most frequent countries have been highlighted.',
         fontsize=12, fontweight='light', fontfamily='serif')

# Insight panel, placed to the right of the axes (x > 1 in figure coordinates)
fig.text(1.1, 1.01, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(1.1, 0.67, '''
Most Netflix content are produced in
the USA, India, and the UK.
''', fontsize=12, fontweight='light', fontfamily='serif')

ax.grid(axis='y', linestyle='-', alpha=0.4)
grid_y_ticks = np.arange(0, 4000, 500)  # y ticks, min, max, then step
ax.set_yticks(grid_y_ticks)
ax.set_axisbelow(True)

# Axis labels
#plt.xlabel("Country", fontsize=12, fontweight='light', fontfamily='serif',loc='left',y=-1.5)
#plt.ylabel("Count", fontsize=12, fontweight='light', fontfamily='serif')
#plt.legend(loc='upper right')

# thicken the bottom line if you want to
plt.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
ax.tick_params(axis='both', which='major', labelsize=12)

# Thin vertical line at the figure's right edge, next to the insight text
import matplotlib.lines as lines
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
fig.lines.extend([l1])

ax.tick_params(axis=u'both', which=u'both', length=0)
plt.show()
<ipython-input-12-9084f6261040>:30: UserWarning: FixedFormatter should only be used together with FixedLocator
# Examining the Movie / TV Show split for the ten countries with most titles
# (the slice was [:11], one more than the "Top 10" the chart title announces)
country_order = df['first_country'].value_counts()[:10].index

# Titles per (country, type). fillna(0) covers a country that has no title
# of one type, which would otherwise leave NaN ratios and labels.
data_q2q3 = (
    df[['type', 'first_country']]
    .groupby('first_country')['type']
    .value_counts()
    .unstack()
    .fillna(0)
    .loc[country_order]
)
data_q2q3['sum'] = data_q2q3.sum(axis=1)
# Share of each type per country, ordered by Movie share
data_q2q3_ratio = (data_q2q3.T / data_q2q3['sum']).T[['Movie', 'TV Show']].sort_values(by='Movie', ascending=False)[::-1]

###
fig, ax = plt.subplots(1, 1, figsize=(15, 8))

ax.barh(data_q2q3_ratio.index, data_q2q3_ratio['Movie'],
        color='#b20710', alpha=0.8, label='Movie')
# TV Show segment starts where the Movie segment ends (stacked bar)
ax.barh(data_q2q3_ratio.index, data_q2q3_ratio['TV Show'], left=data_q2q3_ratio['Movie'],
        color='#221f1f', alpha=0.8, label='TV Show')

ax.set_xlim(0, 1)
ax.set_xticks([])
# Pin the tick positions first. Calling set_yticklabels alone raises
# "FixedFormatter should only be used together with FixedLocator".
ax.set_yticks(np.arange(len(data_q2q3_ratio.index)))
ax.set_yticklabels(data_q2q3_ratio.index, fontfamily='serif', fontsize=11)

# Percentage labels, centred inside each segment.
# Fixed one-decimal formatting: the previous ':.3' meant three significant
# digits, which renders 100.0 as '1e+02'.
for i in data_q2q3_ratio.index:
    movie_share = data_q2q3_ratio.loc[i, 'Movie']
    show_share = data_q2q3_ratio.loc[i, 'TV Show']
    # Movie share
    ax.annotate(f"{movie_share*100:.1f}%",
                xy=(movie_share/2, i),
                va='center', ha='center', fontsize=12, fontweight='light', fontfamily='serif',
                color='white')
    # TV Show share
    ax.annotate(f"{show_share*100:.1f}%",
                xy=(movie_share + show_share/2, i),
                va='center', ha='center', fontsize=12, fontweight='light', fontfamily='serif',
                color='white')

fig.text(0.13, 0.93, 'Top 10 countries Movie & TV Show split', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.131, 0.89, 'Percent Stacked Bar Chart', fontsize=12, fontfamily='serif')

for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)

# Hand-made legend drawn as coloured text instead of ax.legend()
#ax.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.06))
fig.text(0.75, 0.9, "Movie", fontweight="bold", fontfamily='serif', fontsize=15, color='#b20710')
fig.text(0.81, 0.9, "|", fontweight="bold", fontfamily='serif', fontsize=15, color='black')
fig.text(0.82, 0.9, "TV Show", fontweight="bold", fontfamily='serif', fontsize=15, color='#221f1f')

# Insight panel, placed to the right of the axes (x > 1 in figure coordinates)
fig.text(1.1, 0.93, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(1.1, 0.44, '''
Netflix in India
is made up nearly entirely of Movies.
South Korean Netflix on the other hand is
almost entirely TV Shows.
''', fontsize=12, fontweight='light', fontfamily='serif')

# Thin vertical line at the figure's right edge, next to the insight text
import matplotlib.lines as lines
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
fig.lines.extend([l1])

ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis=u'both', which=u'both', length=0)
plt.show()
<ipython-input-13-a3d6c36ae73b>:22: UserWarning: FixedFormatter should only be used together with FixedLocator